{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Arboles de decisión: Hiperparámetros, Random Forest y Optimización de Parámetros" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(os.path.join('../Datasets/diabetes.csv'))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(537, 7) (231, 7) (537,) (231,)\n" ] } ], "source": [ "feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age','Glucose','BloodPressure','DiabetesPedigreeFunction']\n", "X = df[feature_cols]\n", "Y = df[\"Outcome\"]\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1) # 70% training, 30% test\n", "print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n", " max_depth=None, max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort='deprecated',\n", " random_state=None, splitter='best')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# baseline no incluye poda (max_depth)\n", "treev1 = DecisionTreeClassifier()\n", "treev1.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,\n", " 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 
0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,\n", " 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,\n", " 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n", " 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,\n", " 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,\n", " 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,\n", " 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Y_pred = treev1.predict(X_test)\n", "Y_pred" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy del clasificador - version 1 : 0.68\n", "matriz de confusión del clasificador - version 1: \n", " [[111 35]\n", " [ 40 45]]\n", "precision del clasificador - version 1 : 0.56\n", "recall del clasificador - version 1 : 0.53\n", "f1 del clasificador - version 1 : 0.55\n" ] } ], "source": [ "def metricas_desempenio(tree):\n", " print('accuracy del clasificador - version 1 : {0:.2f}'.format(accuracy_score(Y_test, tree.predict(X_test))))\n", " print('matriz de confusión del clasificador - version 1: \\n {0}'.format(confusion_matrix(Y_test, tree.predict(X_test))))\n", " print('precision del clasificador - version 1 : {0:.2f}'.format(precision_score(Y_test, tree.predict(X_test))))\n", " print('recall del clasificador - version 1 : {0:.2f}'.format(recall_score(Y_test, tree.predict(X_test))))\n", " print('f1 del clasificador - version 1 : {0:.2f}'.format(f1_score(Y_test, tree.predict(X_test))))\n", "metricas_desempenio(treev1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, 
criterion='entropy',\n", " max_depth=3, max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort='deprecated',\n", " random_state=None, splitter='best')" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Ajustar algunos hiperparámetros\n", "tree_v2 = DecisionTreeClassifier(criterion=\"entropy\", max_depth=3)\n", "tree_v2.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy del clasificador - version 1 : 0.77\n", "matriz de confusión del clasificador - version 1: \n", " [[124 22]\n", " [ 31 54]]\n", "precision del clasificador - version 1 : 0.71\n", "recall del clasificador - version 1 : 0.64\n", "f1 del clasificador - version 1 : 0.67\n" ] } ], "source": [ "metricas_desempenio(tree_v2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Hiperparámetros para ajustar la complejidad del modelo\n", "\n", "__[DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)__\n", "* class_weight=None importancia relativa de los valores de clasificación\n", "* criterion='entropy'/'gini'\n", "* max_depth=3 distancia max entre a raiz y las hojas\n", "* max_features=None numero max de variables a considerar\n", "* max_leaf_nodes=20 numero max de hojas\n", "* min_impurity_decrease=0.0\n", "* min_impurity_split=None (Deprecado)\n", "* min_samples_leaf=1 Podar si quedan menos que este numero de ejemplos \n", "* min_samples_split=2 Continuar si quedan al menos esta cantidad de ejemplos\n", "* min_weight_fraction_leaf=0.0 Porcentaje minimo de ejemplo para continuar\n", "\n", "\n", "\n", "\n", "**Más allá de cierto umbral, la complejidad del modelo afecta negativamente el desempeño debido al sobreajuste**" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Sobreajuste\n", "\n", "__[Sobreajuste](https://es.wikipedia.org/wiki/Sobreajuste)__\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "**Como evitar?**\n", "En el caso particular de los árboles de decisión, reducir nodos del arbol cuando no incrementan los indicadores con una buena cantidad de datos de prueba (**poda - pruning**)\n", "\n", "### Ensemble learning\n", "\n", "__[Ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning)__\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Forest\n", "\n", "__[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)__\n", "\n", "Se crean varios árboles **INDEPENDIENTES** variando los casos/observaciones del conjunto de entrenamiento y/o las variables empleadas durante el proceso de entrenamiento.\n", "\n", "Las predicciones de cada modelo (árbol) tienen el mismo peso y el resultado final es el voto de mayoría\n", "\n", "**Parámetros:**\n", "* **n_estimators** número de clasificadores, árboles en este caso.\n", "\n", "Los valores adecuados para este y otros parámetros se obtienen via experimentación (prueba y error). 
Si es posible, se recomienda tener varios conjuntos de prueba para seleccionar el modelo con el mejor desempeño (promedio) en todos los conjunto de prueba" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " max_leaf_nodes=None, max_samples=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=10,\n", " n_jobs=None, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "#Ajustar n_estimators puede reducir la posiblidad de overfitting\n", "tree_v3 = RandomForestClassifier(n_estimators=10)\n", "tree_v3" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " max_leaf_nodes=None, max_samples=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=10,\n", " n_jobs=None, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree_v3.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy del clasificador - version 1 : 0.77\n", "matriz de confusión del clasificador - version 1: \n", " [[132 14]\n", " [ 38 47]]\n", "precision del clasificador - version 1 : 0.77\n", "recall del clasificador - version 
1 : 0.55\n", "f1 del clasificador - version 1 : 0.64\n" ] } ], "source": [ "metricas_desempenio(tree_v3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Gradient Boosted Trees\n", "Los arboles se construyen en **secuencia** a partir de una fracción del conjunto de entrenamiento; la idea central es que el siguiente árbol corrija los errores del anterior:\n", "\n", "Inicialmente, todos los ejemplos tienen la misma probabilidad de ser seleccionados. A partir del segundo árbol, los ejemplos que fueron incorrectamente clasificados por el árbol anterior tienen mayor probabilidad de ser seleccionados. (para detectar patrones que no fueron detectados por el anterior)\n", "\n", "En consecuencia, cada árbol se crea a partir de una fracción diferente del conjunto de entrenamiento. En la colección final, la clasificación de cada árbol tiene un peso mayor en función del desempeño obtenido con el conjunto de entrenamiento.\n", "\n", "__[py-xgboost](https://anaconda.org/anaconda/py-xgboost)__\n", "\n", "__[xgboost](https://xgboost.readthedocs.io/en/latest/python/python_api.html)__\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from xgboost.sklearn import XGBClassifier" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n", " colsample_bynode=None, colsample_bytree=None, gamma=None,\n", " gpu_id=None, importance_type='gain', interaction_constraints=None,\n", " learning_rate=None, max_delta_step=None, max_depth=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=10, n_jobs=None, num_parallel_tree=None,\n", " objective='binary:logistic', random_state=None, reg_alpha=None,\n", " reg_lambda=None, scale_pos_weight=None, subsample=None,\n", " tree_method=None, validate_parameters=False, verbosity=None)" ] }, "execution_count": 39, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "tree_v4 = XGBClassifier(n_estimators=10)\n", "tree_v4" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,\n", " colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n", " importance_type='gain', interaction_constraints=None,\n", " learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n", " min_child_weight=1, missing=nan, monotone_constraints=None,\n", " n_estimators=10, n_jobs=0, num_parallel_tree=1,\n", " objective='binary:logistic', random_state=0, reg_alpha=0,\n", " reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,\n", " validate_parameters=False, verbosity=None)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree_v4.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy del clasificador - version 1 : 0.80\n", "matriz de confusión del clasificador - version 1: \n", " [[131 15]\n", " [ 32 53]]\n", "precision del clasificador - version 1 : 0.78\n", "recall del clasificador - version 1 : 0.62\n", "f1 del clasificador - version 1 : 0.69\n" ] } ], "source": [ "metricas_desempenio(tree_v4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parámetros que se pueden emplear para evitar sobre ajuste (overfitting) \n", "* **n_estimators**: a mayor cantidad de ejemplos, se puede incrementar el valor n_estimators para evitar sobreajuste\n", "\n", "* **learning_rate**, determina la probabilidad de que un ejemplo sea seleccionado en la siguiente iteracion, se recomienda un valor entre 0.1 - 0.2 para reducir la probabilidad de que se produzca overfitting\n", "\n", "* **subsample**, permite controlar el tamaño de la fracción del conjunto de entrenamiento para cada iteración. 
Mientras más bajo el valor, más probabilidad hay de que los conjuntos de entrenamiento entre iteraciones sean diferentes (a mayor diferencia, menos probabilidad de que se produzca overfitting). Se recomienda valores entre 0.5 - 1.0\n", "\n", "* **colsample_bytree**, permite controlar la fracción de las variables empleadas para entrenar los árboles en cada iteración. Se recomienda valores entre 0.5 - 1.0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimización de parámetros\n", "\n", "Objetivo: encontrar la mejor combinación de hiper-parámetros para obtener el clasificador con el mejor desempeño.\n", "\n", "Para evitar probar manualmente todas las posibles combinaciones de valores para todos los posibles parámetros que resultan en un buen desempeño, se emplean técnicas de optimización para evitar buscar en todo el espacio de posible valores y garantizar al mismo tiempo un buen desempeño del clasificador. \n", "\n", "\n", "__[hyperopt (Distributed Hyperparameter Optimization)](https://github.com/hyperopt/hyperopt)__ es el módulo python que facilita realizar esta tarea." 
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "#conda install -c conda-forge hyperopt\n", "from hyperopt import fmin, tpe, hp, STATUS_OK,Trials" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "space = {\n", " 'x':hp.quniform('x',-1,1,1), #probar con valores entre -1 y 1, con incrementos de 1 \n", "}" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "def objective(params):\n", " x = int(params['x'])\n", " return {'loss':x ** 2,'status':STATUS_OK} " ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "trials = Trials()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 5/5 [00:00<00:00, 271.82trial/s, best loss: 0.0]\n", "{'x': 0.0}\n" ] } ], "source": [ "best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=5)\n", "print(best)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "#Probar valores entre 100 - 1000, con incrementos de 1 - con igual probabilidad de ser seleccionado: \n", "#'n_estimators':hp.quniform('n_estimators',100,1000,1)\n", "#Crear un diccionario que contiene la configuración para generar diferentes valores para cada parámetro; en este ejemplo,\n", "#para el algoritmo XGBClassifier.\n", "space = {\n", " 'n_estimators':hp.quniform('n_estimators',100,1000,1), #probar con valores entre 100 y 1000, con incrementos de 1 \n", " 'learning_rate':hp.quniform('learning_rate',0.025,0.5,0.025),\n", " 'max_depth':hp.quniform('max_depth',1,13,1),\n", " 'subsample': hp.quniform('subsample',0.5,1,0.05),\n", " 'colsample_bytree':hp.quniform('colsample_bytree',0.5,1,0.05),\n", " 'nthread':6, #cuando sea posible, paralelizar el procesamiento empleando hasta 6 hilos\n", " 'silent':1 #suprimir los mensajes durante la 
ejecución\n", "}" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "#Es necesario definir una función de manera tal que cuando alcance el valor mínimo, esto implique que el clasificador\n", "#ha alcanzado el mejor desempeño. \n", "#En el ejemplo siguiente, el menor valor posible para esta función (0) se da cuando accuracy = 1.\n", "def objective(params):\n", " params['n_estimators'] = int(params['n_estimators'])\n", " params['max_depth'] = int(params['max_depth']) \n", " clf = XGBClassifier(**params) #https://treyhunner.com/2018/10/asterisks-in-python-what-they-are-and-how-to-use-them/\n", " clf.fit(X_train, Y_train) \n", " accuracy = accuracy_score(Y_test, clf.predict(X_test))\n", " return {'loss': 1 - accuracy, 'status': STATUS_OK}" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 100/100 [01:55<00:00, 1.16s/trial, best loss: 0.18181818181818177]\n", "{'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 3.0, 'n_estimators': 560.0, 'subsample': 0.6000000000000001}\n" ] } ], "source": [ "#https://github.com/hyperopt/hyperopt/wiki/FMin#12-attaching-extra-information-via-the-trials-object\n", "#fmin Itera 100 veces y retorna la combinación de parámetros que generan el menor valor para la función 'objective'\n", "trials = Trials()\n", "best = fmin(objective,space,algo=tpe.suggest,trials=trials,max_evals=100)\n", "print(best)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "best['n_estimators'] = int(best['n_estimators'])\n", "best['max_depth'] = int(best['max_depth']) " ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n", " colsample_bynode=None, colsample_bytree=0.8, gamma=None,\n", " gpu_id=None, importance_type='gain', 
interaction_constraints=None,\n", " learning_rate=0.025, max_delta_step=None, max_depth=3,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=560, n_jobs=None, num_parallel_tree=None,\n", " objective='binary:logistic', random_state=None, reg_alpha=None,\n", " reg_lambda=None, scale_pos_weight=None,\n", " subsample=0.6000000000000001, tree_method=None,\n", " validate_parameters=False, verbosity=None)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree_v5 = XGBClassifier(**best)\n", "tree_v5" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,\n", " colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,\n", " importance_type='gain', interaction_constraints=None,\n", " learning_rate=0.025, max_delta_step=0, max_depth=3,\n", " min_child_weight=1, missing=nan, monotone_constraints=None,\n", " n_estimators=560, n_jobs=0, num_parallel_tree=1,\n", " objective='binary:logistic', random_state=0, reg_alpha=0,\n", " reg_lambda=1, scale_pos_weight=1, subsample=0.6000000000000001,\n", " tree_method=None, validate_parameters=False, verbosity=None)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree_v5.fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy del clasificador - version 1 : 0.82\n", "matriz de confusión del clasificador - version 1: \n", " [[131 15]\n", " [ 27 58]]\n", "precision del clasificador - version 1 : 0.79\n", "recall del clasificador - version 1 : 0.68\n", "f1 del clasificador - version 1 : 0.73\n" ] } ], "source": [ "metricas_desempenio(tree_v5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": 
{ "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }